Import the libraries
library(tidyverse)
library(rvest)
library(httr)
library(maps)
library(readxl)
library(plotly)
library(choroplethr)
library(choroplethrMaps)
drug_overdose = read_csv("./data/VSRR_Provisional_Drug_Overdose_Death_Counts.csv") %>%
janitor::clean_names()
state_level = c(state.name[1:8], "District of Columbia", state.name[9:32],"New York City", state.name[33:50])
drug_overdose_52 =
drug_overdose %>%
filter(!(state_name %in% c("United States"))) %>%
relocate(state_name) %>%
mutate(month = factor(month, levels = month.name), # change month and year to factor
year = factor(year),
state_name = factor(state_name, levels = state_level)) %>%
arrange(state_name) %>%
group_by(state_name, year) %>%
mutate(month = sort(month))
It’s hard to find drug-specific data for those state missing specific drug types. Opioids data is much easier to find. We should probably focus on opioids.
We drew a map of the opioids to justify our choice.
opioids_df =
drug_overdose_52 %>%
ungroup() %>%
select(state_name, year, month, indicator, data_value) %>%
filter(!(state_name %in% c("Alabama", "Arkansas", "Florida", "Idaho", "Louisiana", "Minnesota", "Nebraska", "North Dakota", "Pennsylvania")),
str_detect(indicator, "T4")) %>%
mutate(opioids_yn = ifelse(str_detect(indicator, "opioids"), TRUE, FALSE)) %>%
group_by(year, month, opioids_yn) %>%
summarize(opioids_rate = mean(data_value, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
opioids_df %>%
ungroup() %>%
ggplot(aes(x = month, y = opioids_rate, group = opioids_yn, color = opioids_yn))+
geom_point()+
geom_line()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust= 1))+
facet_grid(.~year)
If we decided to focus on opioids, this will make my life much easier.
FL part: Overall review:
fl_death =
drug_overdose_52 %>%
filter(state_name %in% "Florida",
indicator %in% c("Number of Deaths", "Number of Drug Overdose Deaths")) %>%
select(year, month, indicator, deaths = data_value) %>%
pivot_wider(
names_from = indicator,
values_from = deaths
) %>%
janitor::clean_names() %>%
group_by(year, month) %>%
mutate(
percent_overdose_death = number_of_drug_overdose_deaths / number_of_deaths
)
## Adding missing grouping variables: `state_name`
fl_death %>%
ggplot(aes(x = month, y = percent_overdose_death, group = year, color = year))+
geom_point()+
geom_line()
fl_death %>%
ggplot(aes(x = month, y = percent_overdose_death, group = NA))+
geom_point()+
geom_line()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust= 1))+
facet_grid(.~year)
We can also focus just on opioids Data source: cdc wonder, request by queries, Drug overdose deaths are identified using underlying cause-of-death codes from the Tenth Revision of ICD (ICD–10): X40–X44 (unintentional), X60–X64 (suicide), X85 (homicide), and Y10–Y14 (undetermined). Opioids identified by selecting T40.0-40.4, 40.6
# read in data
fl_opi_death = read_csv("./data/fl_opi_death_15_19.csv") %>%
janitor::clean_names() %>%
select(state_name = state, year, month, deaths)
fl_opi_death_add = read_csv("./data/fl_opi_death_20_21.csv") %>%
janitor::clean_names() %>%
select(state_name = occurrence_state, year = year_code, month, deaths)
fl_opi_death = bind_rows(fl_opi_death, fl_opi_death_add) %>%
filter(!is.na(month)) %>%
separate(month, into = c("month", "useless"), sep = "\\,") %>%
select(-useless) %>%
mutate(month = substr(month, 1,3),
month = month.name[match(str_to_title(month), month.abb)],
month = factor(month, levels = month.name),
year = factor(year)) %>%
select(-state_name)
data could be suppressed as indicated by the cdc website, the data value also supported by florida government
But anyway let’s plot
First by year, this is the death count
fl_opi_death %>%
ggplot(aes(x = month, y = deaths, group = year, color = year))+
geom_point()+
geom_line()
what about each month within a year
fl_opi_death %>%
ggplot(aes(x = month, y = deaths, group = NA))+
geom_point()+
geom_line()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust= 1))+
facet_grid(.~year)
It’s climbing, every single month. The 2020 data is less probably because it’s provisional and some data still under investigation
Let’s analyze the number of death vs median household income
fl_eco_df =
read_csv("./data/median_household_income_fl.csv") %>%
janitor::clean_names() %>%
select(year, household_income_by_race, household_income_by_race_moe, geography) %>%
filter(
str_detect(geography, "FL|United States|Florida"),
year >= "2015") %>%
mutate(year = factor(year))
## Rows: 49 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Geography, ID Geography, Slug Geography
## dbl (4): ID Year, Year, Household Income by Race, Household Income by Race Moe
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fl_eco_df %>%
mutate(text_label = str_c("Year: ", year, "\nMedian Household Income: $", household_income_by_race,
"\nMargin of error: ± $", household_income_by_race_moe)) %>%
plot_ly(
x = ~year, y = ~household_income_by_race, color = ~geography, text = ~text_label,
alpha = 0.5, type = "scatter", mode = "markers+lines", colors = "viridis", error_y = ~list(array = household_income_by_race_moe)) %>%
layout(
title = "Median Household Income: FLorida vs. The U.S",
xaxis = list(title = "Year"),
yaxis = list(title = "Median Household Income"))
Income and drug overdose death percent , by year
#patchwork
income_drug_df =
fl_death %>%
ungroup() %>%
group_by(year) %>%
summarize(overdose_death_rate = sum(number_of_drug_overdose_deaths)/sum(number_of_deaths)) %>%
inner_join(., fl_eco_df %>% filter(geography %in% "Florida"))
## Joining, by = "year"
income_drug_df %>%
ggplot(aes(x = household_income_by_race, y = overdose_death_rate, group = NA, color = year))+
geom_point()+
geom_line()
maps maps maps
fl_county_df =
read_csv("./data/NCHS_-_Drug_Poisoning_Mortality_by_County__United_States.csv") %>%
janitor::clean_names() %>%
filter(state %in% "Florida",
year >= 2015) %>%
select(year, county, population, death_rate = model_based_death_rate) %>%
separate(county, into = c("county", "useless"), sep = " County") %>%
select(-useless) %>%
mutate(year = factor(year),
county = str_to_lower(county)) %>%
relocate(county)
## Rows: 50176 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): State, County, Urban/Rural Category
## dbl (9): FIPS, Year, FIPS State, Population, Model-based Death Rate, Standar...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fl_county_df
## # A tibble: 268 × 4
## county year population death_rate
## <chr> <fct> <dbl> <dbl>
## 1 alachua 2015 259177 12.1
## 2 alachua 2016 264287 13.5
## 3 alachua 2017 266820 12.9
## 4 alachua 2018 269956 11.7
## 5 baker 2015 27351 21.9
## 6 baker 2016 27872 21.7
## 7 baker 2017 28230 24.2
## 8 baker 2018 28355 23.0
## 9 bay 2015 181387 17.6
## 10 bay 2016 183384 24.5
## # … with 258 more rows
data(county.fips)
abc = county.fips %>%
separate(polyname, into = c("state", "county"), sep = "\\,") %>%
filter(state %in% "florida") %>%
select(-state) %>%
as.tibble()
fl_county_df = left_join(fl_county_df,abc, by = "county") %>%
select(county, year, death_rate, fips) %>%
filter(year == 2015)
fips_add = c(12027, 12091, 12109, 12111)
a = 1
for (i in c(13,46,55,56)){
fl_county_df[i,4] = fips_add[a]
a = a+1
}
highlight_county = function(county_fips)
{
data(county.map, package="choroplethrMaps", envir=environment())
df = county.map[county.map$region %in% county_fips, ]
geom_polygon(data=df, aes(long, lat, group = group), color = "yellow", fill = NA, size = 1)
}
add_text_county = function(county_fips){
data(county.map, package="choroplethrMaps", envir=environment())
df = county.map[county.map$region %in% county_fips, ]
#geom_text(data=df, aes(mean(long), mean(lat), label = paste0(str_to_title(pull(county_fips, county)), " County\n", pull(county_fips, death_rate))), color = "white")
geom_label(data=df, aes(mean(long), mean(lat), label = paste0(str_to_title(pull(county_fips, county)), " County\nDR: ", round(pull(county_fips, death_rate),2))), fill = "white", size = 3)
}
fl_county_df %>%
group_by(fips) %>%
mutate(fips = as.numeric(fips)) %>%
rename(region = fips,
value = death_rate) %>%
county_choropleth(state_zoom = c("florida"),
legend = "death_rate")+
highlight_county(fl_county_df[which.max(pull(fl_county_df, death_rate)),])+
add_text_county(fl_county_df[which.max(pull(fl_county_df, death_rate)),])
ca_county_df =
read_csv("./data/NCHS_-_Drug_Poisoning_Mortality_by_County__United_States.csv") %>%
janitor::clean_names() %>%
filter(state %in% "California",
year >= 2015) %>%
select(year, county, population, death_rate = model_based_death_rate) %>%
separate(county, into = c("county", "useless"), sep = " County") %>%
select(-useless) %>%
mutate(year = factor(year),
county = str_to_lower(county)) %>%
relocate(county)
## Rows: 50176 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): State, County, Urban/Rural Category
## dbl (9): FIPS, Year, FIPS State, Population, Model-based Death Rate, Standar...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ca_county_df
## # A tibble: 232 × 4
## county year population death_rate
## <chr> <fct> <dbl> <dbl>
## 1 alameda 2015 1634634 12.3
## 2 alameda 2016 1650306 10.5
## 3 alameda 2017 1658131 8.79
## 4 alameda 2018 1666753 12.0
## 5 alpine 2015 1087 22.1
## 6 alpine 2016 1057 24.8
## 7 alpine 2017 1121 26.5
## 8 alpine 2018 1101 26.6
## 9 amador 2015 37037 23.1
## 10 amador 2016 37438 25.3
## # … with 222 more rows
def = county.fips %>%
separate(polyname, into = c("state", "county"), sep = "\\,") %>%
filter(state %in% "california") %>%
select(-state) %>%
as.tibble()
def
## # A tibble: 58 × 2
## fips county
## <int> <chr>
## 1 6001 alameda
## 2 6003 alpine
## 3 6005 amador
## 4 6007 butte
## 5 6009 calaveras
## 6 6011 colusa
## 7 6013 contra costa
## 8 6015 del norte
## 9 6017 el dorado
## 10 6019 fresno
## # … with 48 more rows
ca_county_df = left_join(ca_county_df,def, by = "county") %>%
select(county, year, death_rate, fips) %>%
filter(year == 2015)
ca_county_df %>%
group_by(fips) %>%
mutate(fips = as.numeric(fips)) %>%
rename(region = fips,
value = death_rate) %>%
county_choropleth(state_zoom = c("california"),
legend = "Death Rate") +
highlight_county(ca_county_df[which.max(pull(ca_county_df, death_rate)),])+
add_text_county(ca_county_df[which.max(pull(ca_county_df, death_rate)),])